{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "f11998fb-a205-4b7e-b7a3-2fb33c23f0a2",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.linear_model import LinearRegression\n",
    "from sklearn.metrics import mean_squared_error\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "67bebefc-6789-411e-8832-33629efb2635",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The shape of training data: (1460, 81)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Id</th>\n",
       "      <th>MSSubClass</th>\n",
       "      <th>MSZoning</th>\n",
       "      <th>LotFrontage</th>\n",
       "      <th>LotArea</th>\n",
       "      <th>Street</th>\n",
       "      <th>Alley</th>\n",
       "      <th>LotShape</th>\n",
       "      <th>LandContour</th>\n",
       "      <th>Utilities</th>\n",
       "      <th>...</th>\n",
       "      <th>PoolArea</th>\n",
       "      <th>PoolQC</th>\n",
       "      <th>Fence</th>\n",
       "      <th>MiscFeature</th>\n",
       "      <th>MiscVal</th>\n",
       "      <th>MoSold</th>\n",
       "      <th>YrSold</th>\n",
       "      <th>SaleType</th>\n",
       "      <th>SaleCondition</th>\n",
       "      <th>SalePrice</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>60</td>\n",
       "      <td>RL</td>\n",
       "      <td>65.0</td>\n",
       "      <td>8450</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Reg</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>2008</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "      <td>208500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>20</td>\n",
       "      <td>RL</td>\n",
       "      <td>80.0</td>\n",
       "      <td>9600</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Reg</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>2007</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "      <td>181500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>60</td>\n",
       "      <td>RL</td>\n",
       "      <td>68.0</td>\n",
       "      <td>11250</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IR1</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "      <td>2008</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "      <td>223500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>70</td>\n",
       "      <td>RL</td>\n",
       "      <td>60.0</td>\n",
       "      <td>9550</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IR1</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>2006</td>\n",
       "      <td>WD</td>\n",
       "      <td>Abnorml</td>\n",
       "      <td>140000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>60</td>\n",
       "      <td>RL</td>\n",
       "      <td>84.0</td>\n",
       "      <td>14260</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IR1</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "      <td>2008</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "      <td>250000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 81 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \\\n",
       "0   1          60       RL         65.0     8450   Pave   NaN      Reg   \n",
       "1   2          20       RL         80.0     9600   Pave   NaN      Reg   \n",
       "2   3          60       RL         68.0    11250   Pave   NaN      IR1   \n",
       "3   4          70       RL         60.0     9550   Pave   NaN      IR1   \n",
       "4   5          60       RL         84.0    14260   Pave   NaN      IR1   \n",
       "\n",
       "  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \\\n",
       "0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   \n",
       "1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   \n",
       "2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   \n",
       "3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   \n",
       "4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   \n",
       "\n",
       "  YrSold  SaleType  SaleCondition  SalePrice  \n",
       "0   2008        WD         Normal     208500  \n",
       "1   2007        WD         Normal     181500  \n",
       "2   2008        WD         Normal     223500  \n",
       "3   2006        WD        Abnorml     140000  \n",
       "4   2008        WD         Normal     250000  \n",
       "\n",
       "[5 rows x 81 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train = pd.read_csv('train.csv')\n",
    "print('The shape of training data:', train.shape)\n",
    "train.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "e104635e-0060-4bcb-be2c-27e298690e4c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The shape of testing data: (1459, 80)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Id</th>\n",
       "      <th>MSSubClass</th>\n",
       "      <th>MSZoning</th>\n",
       "      <th>LotFrontage</th>\n",
       "      <th>LotArea</th>\n",
       "      <th>Street</th>\n",
       "      <th>Alley</th>\n",
       "      <th>LotShape</th>\n",
       "      <th>LandContour</th>\n",
       "      <th>Utilities</th>\n",
       "      <th>...</th>\n",
       "      <th>ScreenPorch</th>\n",
       "      <th>PoolArea</th>\n",
       "      <th>PoolQC</th>\n",
       "      <th>Fence</th>\n",
       "      <th>MiscFeature</th>\n",
       "      <th>MiscVal</th>\n",
       "      <th>MoSold</th>\n",
       "      <th>YrSold</th>\n",
       "      <th>SaleType</th>\n",
       "      <th>SaleCondition</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1461</td>\n",
       "      <td>20</td>\n",
       "      <td>RH</td>\n",
       "      <td>80.0</td>\n",
       "      <td>11622</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Reg</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>...</td>\n",
       "      <td>120</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>MnPrv</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "      <td>2010</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1462</td>\n",
       "      <td>20</td>\n",
       "      <td>RL</td>\n",
       "      <td>81.0</td>\n",
       "      <td>14267</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IR1</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Gar2</td>\n",
       "      <td>12500</td>\n",
       "      <td>6</td>\n",
       "      <td>2010</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1463</td>\n",
       "      <td>60</td>\n",
       "      <td>RL</td>\n",
       "      <td>74.0</td>\n",
       "      <td>13830</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IR1</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>MnPrv</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>2010</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1464</td>\n",
       "      <td>60</td>\n",
       "      <td>RL</td>\n",
       "      <td>78.0</td>\n",
       "      <td>9978</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IR1</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "      <td>2010</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1465</td>\n",
       "      <td>120</td>\n",
       "      <td>RL</td>\n",
       "      <td>43.0</td>\n",
       "      <td>5005</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IR1</td>\n",
       "      <td>HLS</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>...</td>\n",
       "      <td>144</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2010</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 80 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \\\n",
       "0  1461          20       RH         80.0    11622   Pave   NaN      Reg   \n",
       "1  1462          20       RL         81.0    14267   Pave   NaN      IR1   \n",
       "2  1463          60       RL         74.0    13830   Pave   NaN      IR1   \n",
       "3  1464          60       RL         78.0     9978   Pave   NaN      IR1   \n",
       "4  1465         120       RL         43.0     5005   Pave   NaN      IR1   \n",
       "\n",
       "  LandContour Utilities  ... ScreenPorch PoolArea PoolQC  Fence MiscFeature  \\\n",
       "0         Lvl    AllPub  ...         120        0    NaN  MnPrv         NaN   \n",
       "1         Lvl    AllPub  ...           0        0    NaN    NaN        Gar2   \n",
       "2         Lvl    AllPub  ...           0        0    NaN  MnPrv         NaN   \n",
       "3         Lvl    AllPub  ...           0        0    NaN    NaN         NaN   \n",
       "4         HLS    AllPub  ...         144        0    NaN    NaN         NaN   \n",
       "\n",
       "  MiscVal MoSold  YrSold  SaleType  SaleCondition  \n",
       "0       0      6    2010        WD         Normal  \n",
       "1   12500      6    2010        WD         Normal  \n",
       "2       0      3    2010        WD         Normal  \n",
       "3       0      6    2010        WD         Normal  \n",
       "4       0      1    2010        WD         Normal  \n",
       "\n",
       "[5 rows x 80 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test = pd.read_csv('test.csv')\n",
    "print('The shape of testing data:', test.shape)\n",
    "test.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "b281e16c-e273-490f-bb32-16f13d1d645d",
   "metadata": {},
   "outputs": [],
   "source": [
    "train.drop('LotFrontage', axis=1, inplace=True)\n",
    "test.drop('LotFrontage', axis=1, inplace=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "bf07803f-a74f-49fe-acfc-0270bb750380",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "number of numeric features: 36\n",
      "number of categorical features: 43\n"
     ]
    }
   ],
   "source": [
    "#分离数字特征和类别特征\n",
    "num_features = []\n",
    "cate_features = []\n",
    "for col in test.columns:\n",
    "    if test[col].dtype == 'object':\n",
    "        cate_features.append(col)\n",
    "    else:\n",
    "        num_features.append(col)\n",
    "print('number of numeric features:', len(num_features))\n",
    "print('number of categorical features:', len(cate_features))\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "2d46954d-10b1-4a9c-818e-0cd4b7da678c",
   "metadata": {},
   "outputs": [],
   "source": [
    "#处理掉右下的明显异常值\n",
    "train = train.drop(train[(train['TotalBsmtSF']>6000) & (train['SalePrice']<200000)].index)\n",
    "train = train.drop(train[(train['GrLivArea']>4000) & (train['SalePrice']<200000)].index)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "4754b563-e6b7-4375-bb7e-31fd1980f4d1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The shape of training data: (1458, 80)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "PoolQC          1452\n",
       "MiscFeature     1404\n",
       "Alley           1367\n",
       "Fence           1177\n",
       "MasVnrType       872\n",
       "FireplaceQu      690\n",
       "GarageType        81\n",
       "GarageYrBlt       81\n",
       "GarageFinish      81\n",
       "GarageQual        81\n",
       "GarageCond        81\n",
       "BsmtExposure      38\n",
       "BsmtFinType2      38\n",
       "BsmtQual          37\n",
       "BsmtCond          37\n",
       "BsmtFinType1      37\n",
       "MasVnrArea         8\n",
       "Electrical         1\n",
       "dtype: int64"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print('The shape of training data:', train.shape)\n",
    "train_missing = train.isnull().sum()\n",
    "train_missing = train_missing.drop(train_missing[train_missing==0].index).sort_values(ascending=False)\n",
    "train_missing\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "baef69b9-6476-46b6-990a-123194f51893",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The shape of testing data: (1459, 79)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "PoolQC          1456\n",
       "MiscFeature     1408\n",
       "Alley           1352\n",
       "Fence           1169\n",
       "MasVnrType       894\n",
       "FireplaceQu      730\n",
       "GarageQual        78\n",
       "GarageYrBlt       78\n",
       "GarageFinish      78\n",
       "GarageCond        78\n",
       "GarageType        76\n",
       "BsmtCond          45\n",
       "BsmtExposure      44\n",
       "BsmtQual          44\n",
       "BsmtFinType2      42\n",
       "BsmtFinType1      42\n",
       "MasVnrArea        15\n",
       "MSZoning           4\n",
       "Functional         2\n",
       "BsmtFullBath       2\n",
       "BsmtHalfBath       2\n",
       "Utilities          2\n",
       "KitchenQual        1\n",
       "TotalBsmtSF        1\n",
       "BsmtUnfSF          1\n",
       "GarageCars         1\n",
       "GarageArea         1\n",
       "BsmtFinSF2         1\n",
       "BsmtFinSF1         1\n",
       "Exterior2nd        1\n",
       "Exterior1st        1\n",
       "SaleType           1\n",
       "dtype: int64"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#查看测试集中各特征的数据缺失个数\n",
    "print('The shape of testing data:', test.shape)\n",
    "test_missing = test.isnull().sum()\n",
    "test_missing = test_missing.drop(test_missing[test_missing==0].index).sort_values(ascending=False)\n",
    "test_missing\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "a9df88fb-b250-48a9-ad92-54f03a59dd62",
   "metadata": {},
   "outputs": [],
   "source": [
    "none_lists = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'BsmtFinType1',\n",
    "              'BsmtFinType2', 'BsmtCond', 'BsmtExposure', 'BsmtQual', 'MasVnrType']\n",
    "for col in none_lists:\n",
    "    train[col] = train[col].fillna('None')\n",
    "    test[col] = test[col].fillna('None')\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "e04d5e52-73b8-4dd2-a602-b7756a4a4b58",
   "metadata": {},
   "outputs": [],
   "source": [
    "most_lists = ['MSZoning', 'Exterior1st', 'Exterior2nd', 'SaleType', 'KitchenQual', 'Electrical']\n",
    "for col in most_lists:\n",
    "    train[col] = train[col].fillna(train[col].mode()[0])\n",
    "    test[col] = test[col].fillna(train[col].mode()[0])    #注意这里补充的是训练集中出现最多的类别\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "cfa46a7e-178d-4d4e-b8e2-2ae8c5eb74b6",
   "metadata": {},
   "outputs": [],
   "source": [
    "most_lists = ['MSZoning', 'Exterior1st', 'Exterior2nd', 'SaleType', 'KitchenQual', 'Electrical']\n",
    "for col in most_lists:\n",
    "    train[col] = train[col].fillna(train[col].mode()[0])\n",
    "    test[col] = test[col].fillna(train[col].mode()[0])    #注意这里补充的是训练集中出现最多的类别\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "7703676e-b513-49d0-8fea-446b791249bd",
   "metadata": {},
   "outputs": [],
   "source": [
    "train['Functional'] = train['Functional'].fillna('Typ')\n",
    "test['Functional'] = test['Functional'].fillna('Typ')\n",
    "\n",
    "train.drop('Utilities', axis=1, inplace=True)\n",
    "test.drop('Utilities', axis=1, inplace=True)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "184d4632-d04e-4325-82ae-be35a30a21d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "zero_lists = ['GarageYrBlt', 'MasVnrArea', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'GarageCars', 'GarageArea',\n",
    "              'TotalBsmtSF']\n",
    "for col in zero_lists:\n",
    "    train[col] = train[col].fillna(0)\n",
    "    test[col] = test[col].fillna(0)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "fedc52fc-14f2-4dee-8dcb-815b074e13e8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.isnull().sum().any()\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "0fb6f309-3967-4015-a9a8-5e5af8307a0f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test.isnull().sum().any()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "6db67baa-0a7b-4116-8faf-8292f3c83a2d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The number of categorical features: 42\n"
     ]
    }
   ],
   "source": [
    "#从存放类别特征的列表去掉'Utilities'\n",
    "cate_features.remove('Utilities')\n",
    "print('The number of categorical features:', len(cate_features))\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "dcc2f3f9-4486-4c31-9702-95c599af0e76",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import LabelEncoder\n",
    "for col in cate_features:\n",
    "    train[col] = train[col].astype(str)\n",
    "    test[col] = test[col].astype(str)\n",
    "le_features = ['Street', 'Alley', 'LotShape', 'LandContour', 'LandSlope', 'HouseStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'ExterQual', \n",
    "               'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'CentralAir',\n",
    "               'KitchenQual', 'Functional', 'FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence']\n",
    "for col in le_features:\n",
    "    encoder = LabelEncoder()\n",
    "    value_train = set(train[col].unique())\n",
    "    value_test = set(test[col].unique())\n",
    "    value_list = list(value_train | value_test)\n",
    "    encoder.fit(value_list)\n",
    "    train[col] = encoder.transform(train[col])\n",
    "    test[col] = encoder.transform(test[col])\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "bba09284-fad1-409b-9900-e507de0ed24a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2917, 155)"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_data = pd.concat((train.drop('SalePrice', axis=1), test)).reset_index(drop=True)\n",
    "all_data = pd.get_dummies(all_data, drop_first=True)  #注意独热编码生成的时候要去掉一个维度，保证剩下的变量都是相互独立的\n",
    "all_data.shape\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "00c07bd6-7e4b-407f-9a48-6c05f4564bdc",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The shape of training data: (1460, 155)\n",
      "The shape of testing data: (1459, 155)\n"
     ]
    }
   ],
   "source": [
    "trainset = all_data[:1460]\n",
    "traincy = pd.read_csv('train.csv')\n",
    "y=traincy['SalePrice']\n",
    "testset = all_data[1458:]\n",
    "print('The shape of training data:', trainset.shape)\n",
    "print('The shape of testing data:', testset.shape)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "a74a57b0-5cba-4019-a86a-942ca8aa1ca8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style>#sk-container-id-1 {color: black;background-color: white;}#sk-container-id-1 pre{padding: 0;}#sk-container-id-1 div.sk-toggleable {background-color: white;}#sk-container-id-1 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-1 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-1 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-1 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-1 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-1 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-1 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-1 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-1 div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-1 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-1 div.sk-item {position: relative;z-index: 1;}#sk-container-id-1 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-1 div.sk-item::before, #sk-container-id-1 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-1 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-1 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-1 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-1 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-1 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-1 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-1 div.sk-label-container {text-align: center;}#sk-container-id-1 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-1 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>LinearRegression()</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" checked><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">LinearRegression</label><div class=\"sk-toggleable__content\"><pre>LinearRegression()</pre></div></div></div></div></div>"
      ],
      "text/plain": [
       "LinearRegression()"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "linear_model = LinearRegression()\n",
    "linear_model.fit(trainset, y)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "a9573473-c8fe-492b-9859-2f1a00626bc7",
   "metadata": {},
   "outputs": [],
   "source": [
    "line_pre = linear_model.predict(testset)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "9a0d8b06-932b-407e-ad2a-eb0ddb8037c5",
   "metadata": {},
   "outputs": [],
   "source": [
    "test = pd.read_csv('test.csv')\n",
    "# print(test.shape,line_pre.shape)\n",
    "we = pd.DataFrame({'Id': test['Id'], 'SalePrice': line_pre})\n",
    "we.to_csv('House_Price_submissionMyself.csv', index=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "99c6b27c-d424-4f3a-8f00-c3cd2f34f1d8",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
